#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(expss)
library(polycor)
library(foreach) ; library(doParallel)
library(knitr)
library(biomaRt)
library(anRichment) ; library(BrainDiseaseCollection)
suppressWarnings(suppressMessages(library(WGCNA)))

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in 20_03_03_data_preprocessing.Rmd) and clustering (pipeline in 20_03_31_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))

# Add gene symbol
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=genes_info$ID, mart=mart)

genes_info = genes_info %>% left_join(gene_names, by=c('ID'='ensembl_gene_id'))


clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]

dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]


rm(DE_info, GO_annotations, clusterings, getinfo, mart, dds)


Relation to external clinical traits


Quantifying module-trait associations


Using the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.

datTraits = datMeta %>% dplyr::select(Diagnosis, brain_lobe, Gender, Age, PMI, SiteHM) %>%
            dplyr::rename('ExtractionBatch' = SiteHM)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}

rm(ME_object)
# Calculate the correlation tha failed with hetcor()
missing_modules = rownames(moduleTraitCor)[is.na(moduleTraitCor[,1])]

for(m in missing_modules){
  cat(paste0('Correcting Module-Diagnosis correlation for Module ', m))
  moduleTraitCor[m,'Diagnosis'] = polyserial(MEs[,m], datTraits$Diagnosis)
}

rm(missing_modules)

I’m going to select all the modules that have an absolute correlation higher than 0.8 with Diagnosis to study them

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
                           'MTcor' = moduleTraitCor[,'Diagnosis'],
                           'MTpval' = moduleTraitPvalue[,'Diagnosis'])

genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')

rm(moduleTraitPvalue, datTraits, textMatrix, diagnosis_cor)


Studying the modules with the highest absolute correlation to Diagnosis


top_modules = gsub('ME','',rownames(moduleTraitCor)[abs(moduleTraitCor[,'Diagnosis'])>0.8])

cat(paste0('Top modules selected: ', paste(top_modules, collapse=', '),'\n'))
## Top modules selected: #89AC00, #00C0BE

The modules consist mainly of points with very high (absolute) values in PC2 (which we know is related to lfc), so this result is consistent with the high correlation between Module and Diagnosis, although some of the points with the highest PC2 values do not belong to these top modules

The module with the positive correlation to Diagnosis is very big

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
            dplyr::select(ID, external_gene_id, PC1, PC2, Module, gene.score) %>%
            mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
            mutate(color = ifelse(ImportantModules=='Others','gray',ImportantModules),
                   alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
                   gene_id = paste0(ID, ' (', external_gene_id, ')'))

table(plot_data$ImportantModules)
## 
## #00C0BE #89AC00  Others 
##     774     305   12619
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) + 
         geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() + 
           ggtitle('Modules with strongest relation to Diagnosis'))
rm(pca)




Module Membership vs Gene Significance


create_plot = function(module){
  
  plot_data = dataset %>% dplyr::select(ID, paste0('MM.',gsub('#','',module)), GS, gene.score) %>% filter(dataset$Module==module)
  colnames(plot_data)[2] = 'Module'
  
  SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='None'])))
  
  p = ggplotly(plot_data %>% ggplot(aes(Module, GS, color=gene.score)) + geom_point(alpha=0.5, aes(ID=ID)) + ylab('Gene Significance') +
               scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,8))) + theme_minimal() + xlab('Module Membership') +
               ggtitle(paste0('Module ', module,'  (MTcor = ', round(moduleTraitCor[paste0('ME',module),1],2),')')))
  
  return(p)
}

create_plot(top_modules[1])
create_plot(top_modules[2])
rm(create_plot)




SFARI Genes


List of top SFARI Genes in top modules ordered by SFARI score and Gene Significance

table_data = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>%
             dplyr::select(ID, external_gene_id, GS, gene.score, Module) %>% arrange(gene.score, desc(abs(GS))) %>%
             dplyr::rename('Ensembl ID'=ID, 'Gene Symbol'=external_gene_id, 
                    'SFARI score'=gene.score, 'Gene Significance'=GS)

kable(table_data %>% filter(Module == top_modules[1] & `SFARI score` %in% c(1,2,3)) %>% dplyr::select(-Module),
      caption=paste0('Top SFARI Genes for Module ', top_modules[1]))
Top SFARI Genes for Module #89AC00
Ensembl ID Gene Symbol Gene Significance SFARI score
ENSG00000114861 FOXP1 0.5410850 2
ENSG00000180370 PAK2 0.5228668 3
ENSG00000076716 GPC4 0.4859893 3
ENSG00000116117 PARD3B 0.4590885 3
ENSG00000008083 JARID2 0.4354733 3
ENSG00000128573 FOXP2 0.2906097 3
ENSG00000180914 OXTR 0.2483673 3
kable(table_data %>% filter(Module == top_modules[2] & `SFARI score` %in% c(1,2,3)) %>% dplyr::select(-Module),
      caption=paste0('Top SFARI Genes for Module ', top_modules[2]))
Top SFARI Genes for Module #00C0BE
Ensembl ID Gene Symbol Gene Significance SFARI score
ENSG00000197283 SYNGAP1 -0.4651354 1
ENSG00000139613 SMARCC2 -0.1671692 2
ENSG00000136854 STXBP1 -0.6297579 3
ENSG00000124140 SLC12A5 -0.5451356 3
ENSG00000166313 APBB1 -0.4388367 3
ENSG00000149972 CNTN5 -0.4131290 3
ENSG00000173064 HECTD4 -0.3721290 3
ENSG00000152910 CNTNAP4 -0.3575881 3
ENSG00000186153 WWOX -0.3288583 3
ENSG00000138411 HECW2 -0.3252800 3
ENSG00000198793 MTOR -0.2739514 3
ENSG00000126012 KDM5C -0.1942812 3
ENSG00000204406 MBD5 -0.1759133 3
ENSG00000089006 SNX5 -0.0944326 3
ENSG00000103197 TSC2 -0.0890664 3

Modules with the strongest module-diagnosis correlation should have the highest percentage of SFARI Genes, but this doesn’t seem to be the case

plot_data = dataset %>% mutate('hasSFARIscore' = gene.score!='None') %>% 
            group_by(Module, MTcor, hasSFARIscore) %>% summarise(p=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(n=n()), by='Module') %>% 
            mutate(p=round(p/n*100,2)) 

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(this_row$hasSFARIscore==FALSE & this_row$p==100){
    new_row = this_row
    new_row$hasSFARIscore = TRUE
    new_row$p = 0
    plot_data = plot_data %>% rbind(new_row)
  }
}

plot_data = plot_data %>% filter(hasSFARIscore==TRUE)

ggplotly(plot_data %>% ggplot(aes(MTcor, p, size=n)) + geom_smooth(color='gray', se=FALSE) +
         geom_point(color=plot_data$Module, alpha=0.5, aes(id=Module)) + geom_hline(yintercept=mean(plot_data$p), color='gray') +
         xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
         theme_minimal() + theme(legend.position = 'none'))
rm(i, this_row, new_row, plot_data)

Breaking the SFARI genes by score

Perhaps the groups are too small to get a significant result

scores = c(1,2,3,4,5,6,'None')

plot_data = dataset %>% group_by(Module, MTcor, gene.score) %>% summarise(n=n()) %>% 
            left_join(dataset %>% group_by(Module) %>% summarise(N=n()), by='Module') %>% 
            mutate(p=round(n/N*100,2), gene.score = as.character(gene.score))

for(i in 1:nrow(plot_data)){
  this_row = plot_data[i,]
  if(sum(plot_data$Module == this_row$Module)<7){
    missing_scores = which(! scores %in% plot_data$gene.score[plot_data$Module == this_row$Module])
    for(s in missing_scores){
      new_row = this_row
      new_row$gene.score = as.character(s)
      new_row$n = 0
      new_row$p = 0
      plot_data = plot_data %>% rbind(new_row) 
    }
  }
}

plot_data = plot_data %>% filter(gene.score != 'None')

plot_function = function(i){
  i = 2*i-1
  plot_list = list()
  for(j in 1:2){
    plot_list[[j]] = ggplotly(plot_data %>% filter(gene.score==scores[i+j-1]) %>% ggplot(aes(MTcor, p, size=n)) + 
                geom_smooth(color='gray', se=FALSE) +
                geom_point(color=plot_data$Module[plot_data$gene.score==scores[i+j-1]], alpha=0.5, aes(id=Module)) +
                geom_hline(yintercept=mean(plot_data$p[plot_data$gene.score==scores[i+j-1]]), color='gray') +
                xlab('Module-Diagnosis correlation') + ylab('% of SFARI genes') +
                theme_minimal() + theme(legend.position = 'none'))
  }
  p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
    list(x = 0.2 , y = 1.05, text = paste0('SFARI score ', scores[i]), showarrow = F, xref='paper', yref='paper'),
    list(x = 0.8 , y = 1.05, text = paste0('SFARI score ', scores[i+1]), showarrow = F, xref='paper', yref='paper')))
  
  return(p)
}

plot_function(1)
plot_function(2)
plot_function(3)
rm(i, s, this_row, new_row, plot_function)




Module Eigengenes


Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control.

In both cases, the Eigengenes separate the behaviour between autism and control samples very clearly!

plot_EGs = function(module){

  plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME',module)], 'Diagnosis' = datMeta$Diagnosis)

  p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) + geom_boxplot() + theme_minimal() + theme(legend.position='none') +
                    ggtitle(paste0('Module ', module, '  (MTcor=',round(moduleTraitCor[paste0('ME',module),1],2),')'))
  return(p)
}

p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])

grid.arrange(p1, p2, nrow=1)

rm(plot_EGs, p1, p2)




Identifying important genes


Selecting the modules with the highest correlation to Diagnosis, and, from them, the genes with the highest module membership-(absolute) gene significance

*Ordered by \(\frac{MM+|GS|}{2}\)

There aren’t many SFARI genes in the top genes of each module, and all of the have a score of 4

create_table = function(module){
  top_genes = dataset %>% left_join(genes_info %>% dplyr::select(ID, external_gene_id), by='ID') %>% 
              dplyr::select(ID, external_gene_id, paste0('MM.',gsub('#','',module)), GS, gene.score) %>%
              filter(dataset$Module==module) %>% dplyr::rename('MM' = paste0('MM.',gsub('#','',module))) %>% 
              mutate(importance = (MM+abs(GS))/2) %>% arrange(by=-importance) %>% top_n(20)
  return(top_genes)
}

top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])

kable(top_genes[[1]], caption=paste0('Top 10 genes for module ', top_modules[1], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[1]),1],2),')'))
Top 10 genes for module #89AC00 (MTcor = 0.86)
ID external_gene_id MM GS gene.score importance
ENSG00000186918 ZNF395 0.7632453 0.8102193 None 0.7867323
ENSG00000102265 TIMP1 0.7909589 0.7806321 None 0.7857955
ENSG00000133321 RARRES3 0.7542158 0.7920390 None 0.7731274
ENSG00000155366 RHOC 0.8164991 0.7259850 None 0.7712420
ENSG00000105559 PLEKHA4 0.7387283 0.7852350 None 0.7619816
ENSG00000196136 SERPINA3 0.7671404 0.7068174 None 0.7369789
ENSG00000168899 VAMP5 0.7714065 0.6880403 None 0.7297234
ENSG00000197747 S100A10 0.6417222 0.8104358 None 0.7260790
ENSG00000132470 ITGB4 0.7394859 0.6896188 None 0.7145523
ENSG00000104447 TRPS1 0.6420689 0.7582752 None 0.7001721
ENSG00000099377 HSD3B7 0.7160660 0.6812622 None 0.6986641
ENSG00000067182 TNFRSF1A 0.7705214 0.6234085 None 0.6969649
ENSG00000139644 TMBIM6 0.7541536 0.6372137 None 0.6956837
ENSG00000198492 YTHDF2 0.7136371 0.6772049 None 0.6954210
ENSG00000176971 FIBIN 0.7006333 0.6854169 None 0.6930251
ENSG00000218891 ZNF579 0.6981817 0.6766413 None 0.6874115
ENSG00000152284 TCF7L1 0.7163302 0.6557512 None 0.6860407
ENSG00000025708 TYMP 0.6893407 0.6611546 None 0.6752477
ENSG00000159403 C1R 0.6643043 0.6843784 None 0.6743414
ENSG00000046653 GPM6B 0.6910618 0.6567226 None 0.6738922
kable(top_genes[[2]], caption=paste0('Top 10 genes for module ', top_modules[2], '  (MTcor = ',
                                  round(moduleTraitCor[paste0('ME',top_modules[2]),1],2),')'))
Top 10 genes for module #00C0BE (MTcor = -0.83)
ID external_gene_id MM GS gene.score importance
ENSG00000155093 PTPRN2 0.8675243 -0.8288965 None 0.8482104
ENSG00000134780 DAGLA 0.8357091 -0.7813236 4 0.8085164
ENSG00000160014 CALM3 0.8250775 -0.7302390 None 0.7776583
ENSG00000178233 TMEM151B 0.7791630 -0.7113656 None 0.7452643
ENSG00000108509 CAMTA2 0.7495079 -0.7281840 None 0.7388459
ENSG00000115504 EHBP1 0.7829185 -0.6933431 None 0.7381308
ENSG00000198853 RUSC2 0.6919524 -0.7777515 None 0.7348520
ENSG00000157064 NMNAT2 0.7480605 -0.7177706 None 0.7329155
ENSG00000176490 DIRAS1 0.6904759 -0.7680531 None 0.7292645
ENSG00000171314 PGAM1 0.7929100 -0.6463476 None 0.7196288
ENSG00000180155 LYNX1 0.6884093 -0.7484455 None 0.7184274
ENSG00000100938 GMPR2 0.7693403 -0.6620274 None 0.7156839
ENSG00000112186 CAP2 0.7323813 -0.6987556 None 0.7155684
ENSG00000070610 GBA2 0.7258861 -0.6995045 None 0.7126953
ENSG00000138834 MAPK8IP3 0.6817788 -0.7386141 None 0.7101964
ENSG00000160460 SPTBN4 0.6094721 -0.7856013 None 0.6975367
ENSG00000130477 UNC13A 0.6598345 -0.7186255 4 0.6892300
ENSG00000172379 ARNT2 0.7201537 -0.6534758 4 0.6868148
ENSG00000106603 COA1 0.7126829 -0.6604257 None 0.6865543
ENSG00000054803 CBLN4 0.7239141 -0.6402402 None 0.6820772
rm(create_table)
pca = datExpr %>% prcomp

ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(dataset, by='ID') %>% dplyr::select(ID, PC1, PC2, Module, gene.score) %>%
            mutate(color = ifelse(Module %in% top_modules, as.character(Module), 'gray')) %>%
            mutate(alpha = ifelse(color %in% top_modules & 
                                  ID %in% ids, 1, 0.1))

plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) + 
              theme_minimal() + ggtitle('Important genes identified through WGCNA')

Level of expression by Diagnosis for top genes, ordered by importance (defined above)

create_plot = function(i){
  
  plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>% 
              melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
              left_join(top_genes[[i]], by='ID') %>%
              left_join(datMeta %>% dplyr::select(ID, Diagnosis),
                        by = c('variable'='ID')) %>% arrange(desc(importance))
  
  p = ggplotly(plot_data %>% mutate(external_gene_id=factor(external_gene_id, 
                                    levels=unique(plot_data$external_gene_id), ordered=T)) %>%
               ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
                      xlab(paste0('Top genes for module ', top_modules[i], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[i]][1],2), ')')) + ylab('Level of Expression') +
                      theme(axis.text.x = element_text(angle = 90, hjust = 1)))
  return(p)
  
}

create_plot(1)
create_plot(2)
rm(create_plot)




Enrichment Analysis


Using the package anRichment

# Prepare dataset

# Create dataset with top modules membership and removing the genes without an assigned module
EA_dataset = data.frame('ensembl_gene_id' = genes_info$ID,
                        module = ifelse(genes_info$Module %in% top_modules, genes_info$Module, 'other')) %>%
             filter(genes_info$Module!='gray')

# Assign Entrez Gene Id to each gene
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=EA_dataset$ensembl_gene_id, mart=mart)

EA_dataset = EA_dataset %>% left_join(biomart_output, by='ensembl_gene_id')

for(tm in top_modules){
  cat(paste0('\n',sum(EA_dataset$module==tm & is.na(EA_dataset$entrezgene)), ' genes from top module ',
             tm, ' don\'t have an Entrez Gene ID')) 
}
## 
## 2 genes from top module #89AC00 don't have an Entrez Gene ID
## 9 genes from top module #00C0BE don't have an Entrez Gene ID
rm(getinfo, mart, biomart_output, tm)
# Manual: https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/GeneAnnotation/Tutorials/anRichment-Tutorial1.pdf

collectGarbage()

# Prepare datasets
GO_col = buildGOcollection(organism = 'human', verbose = 0)
## Loading required package: org.Hs.eg.db
## 
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
internal_col = internalCollection(organism = 'human')
MillerAIBS_col = MillerAIBSCollection(organism = 'human')
BrainDisease_col = BrainDiseaseCollection(organism = 'human')
combined_col = mergeCollections(GO_col, internal_col, MillerAIBS_col, BrainDisease_col)

# Print collections used
cat('Using collections: ')
## Using collections:
knownGroups(combined_col, sortBy = 'size')
##  [1] "GO"                                         
##  [2] "GO.BP"                                      
##  [3] "GO.MF"                                      
##  [4] "GO.CC"                                      
##  [5] "JA Miller at AIBS"                          
##  [6] "Chip-X enrichment analysis (ChEA)"          
##  [7] "Brain"                                      
##  [8] "JAM"                                        
##  [9] "Prenatal brain"                             
## [10] "Brain region markers"                       
## [11] "Cortex"                                     
## [12] "Brain region marker enriched gene sets"     
## [13] "WGCNA"                                      
## [14] "BrainRegionMarkers"                         
## [15] "BrainRegionMarkers.HBA"                     
## [16] "BrainRegionMarkers.HBA.localMarker(top200)" 
## [17] "Postnatal brain"                            
## [18] "ImmunePathways"                             
## [19] "Markers of cortex layers"                   
## [20] "BrainLists"                                 
## [21] "Cell type markers"                          
## [22] "Germinal brain"                             
## [23] "BrainRegionMarkers.HBA.globalMarker(top200)"
## [24] "Accelerated evolution"                      
## [25] "Postmitotic brain"                          
## [26] "BrainLists.Blalock_AD"                      
## [27] "BrainLists.DiseaseGenes"                    
## [28] "BloodAtlases"                               
## [29] "Verge Disease Genes"                        
## [30] "BloodAtlases.Whitney"                       
## [31] "BrainLists.JAXdiseaseGene"                  
## [32] "BrainLists.MO"                              
## [33] "Age-associated genes"                       
## [34] "BrainLists.Lu_Aging"                        
## [35] "Cell type marker enriched gene sets"        
## [36] "BrainLists.CA1vsCA3"                        
## [37] "BrainLists.MitochondrialType"               
## [38] "BrainLists.MO.2+_26Mar08"                   
## [39] "BrainLists.MO.Sugino"                       
## [40] "BloodAtlases.Gnatenko2"                     
## [41] "BloodAtlases.Kabanova"                      
## [42] "BrainLists.Voineagu"                        
## [43] "StemCellLists"                              
## [44] "StemCellLists.Lee"
# Perform Enrichment Analysis
enrichment = enrichmentAnalysis(classLabels = EA_dataset$module, identifiers = EA_dataset$entrezgene,
                                refCollection = combined_col, #useBackground = 'given', 
                                threshold = 1e-4, thresholdType = 'Bonferroni',
                                getOverlapEntrez = FALSE, getOverlapSymbols = TRUE)
##  enrichmentAnalysis: preparing data..
##   ..working on label set 1 ..


Results


kable(enrichment$enrichmentTable %>% filter(class==top_modules[1]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio,
                    effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[1], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[1]][1],4), ')'))
Enriched terms for module #89AC00 (MTcor = 0.8554)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
GO:0006955 immune response GO|GO.BP 0.0000007 0.0000007 2.364814 304 1262 68
JAMiller.AIBS.000143 Lowest in CP of 13-16 post-conception weeks human JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex 0.0000129 0.0000064 2.267671 304 1258 65
GO:0006952 defense response GO|GO.BP 0.0000509 0.0000129 2.433224 304 974 54
GO:0034341 response to interferon-gamma GO|GO.BP 0.0000517 0.0000129 6.123929 304 129 18
JAMiller.AIBS.000097 Cortical astrocytes JA Miller at AIBS|Brain|Postnatal brain|Cell type markers|Cortex 0.0001028 0.0000206 1.836240 304 2175 91
JAMiller.AIBS.000154 Highest in VZ of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Germinal brain 0.0001412 0.0000235 2.047345 304 1522 71
GO:0002376 immune system process GO|GO.BP 0.0005115 0.0000639 1.860822 304 1934 82
GO:0034097 response to cytokine GO|GO.BP 0.0014484 0.0001558 2.444266 304 808 45
GO:0045087 innate immune response GO|GO.BP 0.0015584 0.0001558 2.796414 304 565 36
GO:0002252 immune effector process GO|GO.BP 0.0031755 0.0002887 2.410835 304 801 44
kable(enrichment$enrichmentTable %>% filter(class==top_modules[2]) %>% 
      dplyr::select(dataSetID, shortDataSetName, inGroups, Bonferroni, FDR, enrichmentRatio,
                    effectiveClassSize, effectiveSetSize, nCommonGenes) %>%
      arrange(Bonferroni, desc(enrichmentRatio)),
      caption = paste0('Enriched terms for module ', top_modules[2], ' (MTcor = ',
                       round(genes_info$MTcor[genes_info$Module==top_modules[2]][1],4), ')'))
Enriched terms for module #00C0BE (MTcor = -0.8323)
dataSetID shortDataSetName inGroups Bonferroni FDR enrichmentRatio effectiveClassSize effectiveSetSize nCommonGenes
JAMiller.AIBS.000569 WGCNA humanSpecificOlivedrab2Module frontalCtx FOXP2 JA Miller at AIBS|Brain|Postnatal brain|Cortex|WGCNA 0.0003229 0.0000461 1.335144 757 3749 284
GO:0097458 neuron part GO|GO.CC 0.0088920 0.0006840 1.593735 757 1316 119
JAMiller.AIBS.000048 CortexWGCNA 15-21 post-conception weeks C22 CPenriched enrichedForAutismGenes JA Miller at AIBS|Brain|Prenatal brain|Cortex|WGCNA 0.0193707 0.0010761 1.928187 757 585 64
GO:0098793 presynapse GO|GO.CC 0.0894779 0.0035451 2.105824 757 385 46
JAMiller.AIBS.000150 Highest in CP of E14.5 mouse JA Miller at AIBS|Brain|Prenatal brain|Markers of cortex layers|Cortex|Postmitotic brain 0.0981017 0.0036334 1.566327 757 1204 107
GO:0061337 cardiac conduction GO|GO.BP 0.3425215 0.0103794 3.119440 757 113 20
GO:0044456 synapse part GO|GO.CC 0.4524577 0.0122286 1.686255 757 763 73
GO:0045202 synapse GO|GO.CC 0.4959038 0.0130501 1.592388 757 974 88
GO:0035637 multicellular organismal signaling GO|GO.BP 0.6671899 0.0162729 2.738995 757 148 23
GO:0070382 exocytic vesicle GO|GO.CC 1.0000000 0.0242656 2.649485 757 153 23

Save Enrichment Analysis results

save(enrichment, file='./../Data/enrichmentAnalysis.RData')
#load('./../Data/enrichmentAnalysis.RData')



Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.8.2                      
##  [2] BrainDiseaseCollection_1.00             
##  [3] anRichment_1.01-2                       
##  [4] TxDb.Mmusculus.UCSC.mm10.knownGene_3.4.7
##  [5] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 
##  [6] GenomicFeatures_1.36.4                  
##  [7] GenomicRanges_1.36.1                    
##  [8] GenomeInfoDb_1.20.0                     
##  [9] anRichmentMethods_0.90-1                
## [10] WGCNA_1.69                              
## [11] fastcluster_1.1.25                      
## [12] dynamicTreeCut_1.63-1                   
## [13] GO.db_3.8.2                             
## [14] AnnotationDbi_1.46.1                    
## [15] IRanges_2.18.3                          
## [16] S4Vectors_0.22.1                        
## [17] Biobase_2.44.0                          
## [18] BiocGenerics_0.30.0                     
## [19] biomaRt_2.40.5                          
## [20] knitr_1.28                              
## [21] doParallel_1.0.15                       
## [22] iterators_1.0.12                        
## [23] foreach_1.5.0                           
## [24] polycor_0.7-10                          
## [25] expss_0.10.2                            
## [26] GGally_1.5.0                            
## [27] gridExtra_2.3                           
## [28] viridis_0.5.1                           
## [29] viridisLite_0.3.0                       
## [30] RColorBrewer_1.1-2                      
## [31] dendextend_1.13.4                       
## [32] plotly_4.9.2                            
## [33] glue_1.3.2                              
## [34] reshape2_1.4.3                          
## [35] forcats_0.5.0                           
## [36] stringr_1.4.0                           
## [37] dplyr_0.8.5                             
## [38] purrr_0.3.3                             
## [39] readr_1.3.1                             
## [40] tidyr_1.0.2                             
## [41] tibble_3.0.0                            
## [42] ggplot2_3.3.0                           
## [43] tidyverse_1.3.0                         
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.4-0                 plyr_1.8.6                 
##   [5] lazyeval_0.2.2              splines_3.6.3              
##   [7] crosstalk_1.1.0.1           BiocParallel_1.18.1        
##   [9] digest_0.6.25               htmltools_0.4.0            
##  [11] fansi_0.4.1                 magrittr_1.5               
##  [13] checkmate_2.0.0             memoise_1.1.0              
##  [15] cluster_2.1.0               annotate_1.62.0            
##  [17] Biostrings_2.52.0           modelr_0.1.6               
##  [19] matrixStats_0.56.0          prettyunits_1.1.1          
##  [21] jpeg_0.1-8.1                colorspace_1.4-1           
##  [23] blob_1.2.1                  rvest_0.3.5                
##  [25] haven_2.2.0                 xfun_0.12                  
##  [27] crayon_1.3.4                RCurl_1.98-1.1             
##  [29] jsonlite_1.6.1              genefilter_1.66.0          
##  [31] impute_1.58.0               survival_3.1-11            
##  [33] gtable_0.3.0                zlibbioc_1.30.0            
##  [35] XVector_0.24.0              DelayedArray_0.10.0        
##  [37] scales_1.1.0                mvtnorm_1.1-0              
##  [39] DBI_1.1.0                   Rcpp_1.0.4                 
##  [41] xtable_1.8-4                progress_1.2.2             
##  [43] htmlTable_1.13.3            foreign_0.8-75             
##  [45] bit_1.1-15.2                preprocessCore_1.46.0      
##  [47] Formula_1.2-3               htmlwidgets_1.5.1          
##  [49] httr_1.4.1                  acepack_1.4.1              
##  [51] ellipsis_0.3.0              farver_2.0.3               
##  [53] pkgconfig_2.0.3             reshape_0.8.8              
##  [55] XML_3.99-0.3                nnet_7.3-13                
##  [57] dbplyr_1.4.2                locfit_1.5-9.4             
##  [59] labeling_0.3                tidyselect_1.0.0           
##  [61] rlang_0.4.5                 munsell_0.5.0              
##  [63] cellranger_1.1.0            tools_3.6.3                
##  [65] cli_2.0.2                   generics_0.0.2             
##  [67] RSQLite_2.2.0               broom_0.5.5                
##  [69] evaluate_0.14               yaml_2.2.1                 
##  [71] bit64_0.9-7                 fs_1.4.0                   
##  [73] nlme_3.1-144                xml2_1.2.5                 
##  [75] compiler_3.6.3              rstudioapi_0.11            
##  [77] curl_4.3                    png_0.1-7                  
##  [79] reprex_0.3.0                geneplotter_1.62.0         
##  [81] stringi_1.4.6               highr_0.8                  
##  [83] lattice_0.20-40             Matrix_1.2-18              
##  [85] vctrs_0.2.4                 pillar_1.4.3               
##  [87] lifecycle_0.2.0             data.table_1.12.8          
##  [89] bitops_1.0-6                rtracklayer_1.44.4         
##  [91] R6_2.4.1                    latticeExtra_0.6-29        
##  [93] codetools_0.2-16            assertthat_0.2.1           
##  [95] SummarizedExperiment_1.14.1 DESeq2_1.24.0              
##  [97] withr_2.1.2                 GenomicAlignments_1.20.1   
##  [99] Rsamtools_2.0.3             GenomeInfoDbData_1.2.1     
## [101] mgcv_1.8-31                 hms_0.5.3                  
## [103] grid_3.6.3                  rpart_4.1-15               
## [105] rmarkdown_2.1               lubridate_1.7.4            
## [107] base64enc_0.1-3